Load libraries:

library(ggplot2)
library(readr)
library(RColorBrewer)
library(reshape2)
library(dplyr)
library(plotly)
library(scales)

Read the dataset:

df <- read_csv("data/gun-violence-data_01-2013_03-2018.csv")

There are a few columns we don’t care about - drop those

gun <- select(df, -c(incident_url, source_url, incident_url_fields_missing, congressional_district, sources, state_house_district, state_senate_district))

How many gun casualties (defined as number killed and injured) were there for each year?

gun %>% group_by(Year = format(date, "%Y")) %>% 
      summarize(n = sum(n_injured + n_killed)) %>% 
      ggplot(aes(x = Year, y = n)) + geom_col()

Clearly there are missing data from 2013, and 2018 isn’t complete yet. Let’s drop these two years. Let’s also separate the suicides.

gun <- filter(gun, format(date, "%Y") != 2018 & format(date, "%Y") != 2013)
suicide <- filter(gun, grepl("\\|\\|Suicide\\^", incident_characteristics)
                      & n_killed + n_injured < 2)

gun <- filter(gun, !(incident_id %in% suicide$incident_id))

Most Dangerous States

How many gun casualties were there in each state for 2014-2017?

cols <- brewer.pal(3, "Dark2")
gun %>% group_by(state) %>% 
      summarize(n_casualty = sum(n_killed + n_injured)) %>% 
      ggplot(aes(x = reorder(state, -n_casualty), y = n_casualty)) + 
      geom_col(fill = cols[1]) + 
      theme(axis.text.x  = element_text(angle=90, vjust=0.5, size = 14)) + 
      theme(axis.title.y = element_text(size = 16)) + 
      theme(plot.title = element_text(size = 16)) + 
      labs(x = NULL, y = "Casualties (Injuries + Fatalities)", 
           title = "Number of Gun Casualties in the US by State for 2014-2017")

The top 5 states with the highest casualty rate are:

  1. Illinois
  2. California
  3. Texas
  4. Florida
  5. Ohio

However, these are also pretty populated states. What we really want to know is the per capita rate, since population is an important factor. I got census data for each state from 2010-2017 from the U.S. Census Bureau, and then calculated the per capita rates (per hundred thousand people) for each year:

# READ IN US POPULATION DATA BY STATE
colNames <- c("id1", "id2", "state", "census2010", "cenbase2010", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017")
pop <- read_csv("data/US\ Population\ by\ State/PEP_2017_PEPANNRES_with_ann.csv", skip = 2, col_names = colNames)

# Only keep the columns and rows we want
pop <- select(pop, id1, state, "2014", "2015", "2016", "2017") %>% slice(6:n())

# Casualties by state and year; I removed Washington DC from the state rankings, since it's technically a city, not a state
casByState <- group_by(gun, state, year = format(date, "%Y")) %>% filter(state != "District of Columbia") %>% 
      summarize(n.casualty = sum(n_killed + n_injured), n.injured = sum(n_injured), n.killed = sum(n_killed))

# Add the population and the per capita rate (per 100K people)

# First, generate the population vector:
popList <- numeric()
for (i in 1:nrow(casByState)) {
      popList <- c(popList, as.numeric(pop[pop$state == casByState$state[i], casByState$year[i]]))
}

casByState <- ungroup(casByState) %>% mutate(population = popList, cas.per.capita100K = n.casualty/population*100000, k.per.capita100K = n.killed/population*100000, i.per.capita100K = n.injured/population*100000)

Now that we have the per-state data for each year, let’s take a look at it:

## # A tibble: 200 x 9
##    state  year  n.casualty n.injured n.killed population cas.per.capita10…
##    <chr>  <chr>      <int>     <int>    <int>      <dbl>             <dbl>
##  1 Alaba… 2014         914       591      323   4840037.             18.9 
##  2 Alaba… 2015         937       562      375   4850858.             19.3 
##  3 Alaba… 2016        1234       761      473   4860545.             25.4 
##  4 Alaba… 2017        1387       856      531   4874747.             28.5 
##  5 Alaska 2014          70        49       21    736759.              9.50
##  6 Alaska 2015         143        84       59    737979.             19.4 
##  7 Alaska 2016         170       103       67    741522.             22.9 
##  8 Alaska 2017         125        70       55    739795.             16.9 
##  9 Arizo… 2014         435       222      213   6706435.              6.49
## 10 Arizo… 2015         397       193      204   6802262.              5.84
## # ... with 190 more rows, and 2 more variables: k.per.capita100K <dbl>,
## #   i.per.capita100K <dbl>

To make plotting a little simpler, let’s take the mean of each per capita rate for each state across each year. So, for each state there will be four entries: 2014, 2015, 2016, and 2017.

meanCBS <- group_by(casByState, state) %>% summarize(meanCasRate = mean(cas.per.capita100K), meanKRate = mean(k.per.capita100K), meanIRate = mean(i.per.capita100K))

Now let’s plot it and see:

ggplot(data = meanCBS, aes(x = reorder(state, -meanCasRate), y = meanCasRate)) + 
      geom_col(fill = cols[2]) + 
      theme(axis.text.x  = element_text(angle = 90, vjust = 0.5, size = 14)) + 
      theme(axis.title.y = element_text(size = 16)) + 
      theme(plot.title = element_text(size = 16)) + 
      labs(x = NULL, y = "Mean Casualty Rate per 100,000 People", 
           title = "Mean Casualty Rate per 100,000 People by State (2014-2017)") +
      geom_hline(yintercept = median(meanCBS$meanCasRate), lwd = 1)

The black line represents the median value for all 50 states. The rankings have changed significantly. The top 5 states with the highest per capita casualty rate are:

  1. Louisiana
  2. Illinois
  3. Delaware
  4. Mississippi
  5. Alabama

The only state that remained in the top 5 after adjusting for population was Illinois, and that dropped one level.

Most Dangerous Cities

Let’s now look at the cities with the highest rates of per capita gun violence. I again used data from the US Census Bureau to find the population of cities in the United States. The dataset I used contained data from 2010-2016 for all cities greater than 50,000 residents. I had to do some manipulation of the city names in order to make them match the cities in the original dataset. Also, since there are no data for 2017, I just used the data for 2016 and assumed there wouldn’t be any significant changes.

# READ IN US POPULATION DATA BY CITY
colNames <- c("id1", "id2", "country", "tid1", "tid2", "rank", "geography1", "city", "census2010", "cenbase2010", "2010", "2011", "2012", "2013", "2014", "2015", "2016")
citypop <- read_csv("data/US\ Population\ by\ City/PEP_2016_PEPANNRSIP.US12A_with_ann.csv", skip = 2, col_names = colNames)

# Only keep the columns and rows we want (drop all the "government" and "county" entries)
citypop <- select(citypop, tid2, city, "2014", "2015", "2016") %>% filter(!grepl("[Cc]ounty|government", city))

# Reformatting city names
cityState <- sub("^(.+) (city( \\(balance\\))?|municipality|village|town), (.+)", "\\1, \\4", citypop$city) %>% strsplit(", ")

# split up the city column into city and state columns
cityState <- as.data.frame(matrix(unlist(cityState), nrow = length(cityState), byrow = TRUE), stringsAsFactors = FALSE)
names(cityState) <- c("city", "state")

# Ventura, CA has a nonstandard entry; fix it
cityState$city[grep("\\(", cityState$city)] <- "Ventura"

# Integrate the new columns back to the tibble and rearrange
# Also, since we are missing data for 2017, just reuse the 2016 column
citypop <- mutate(citypop, city = cityState$city, state = cityState$state, "2017" = citypop$`2016`) %>% select(tid2, city, state, "2014", "2015", "2016", "2017")

Get the data by city for each year. Note: some years will be missing because those cities did not have any reported gun crimes in those years. Although this will affect the mean values for cities without any reported gun crimes in some years, we are only going to look at the top 50 cities. For these cities to be in the top 50, it stands to reason that there will be crimes from all four years.

casByCity <- gun %>% group_by(city_or_county, state, year = format(date, "%Y")) %>% summarize(n.casualty = sum(n_killed + n_injured), n.killed = sum(n_killed), n.injured = sum(n_injured))
casByCity
## # A tibble: 30,980 x 6
## # Groups:   city_or_county, state [?]
##    city_or_county state          year  n.casualty n.killed n.injured
##    <chr>          <chr>          <chr>      <int>    <int>     <int>
##  1 Abbeville      Alabama        2016           1        0         1
##  2 Abbeville      Alabama        2017           1        1         0
##  3 Abbeville      Georgia        2017           0        0         0
##  4 Abbeville      Louisiana      2014           5        2         3
##  5 Abbeville      Louisiana      2015           5        0         5
##  6 Abbeville      Louisiana      2016           4        1         3
##  7 Abbeville      Louisiana      2017           4        1         3
##  8 Abbeville      South Carolina 2014           1        0         1
##  9 Abbeville      South Carolina 2016           1        0         1
## 10 Abbeville      South Carolina 2017           4        1         3
## # ... with 30,970 more rows

Now, we need a per capita estimate for each city. First we need to generate a vector of populations: figure out if the city in our data frame is actually in the population dataset, and if it is, grab the population for that particular year. Again, the 2017 population values are the same as the 2016 ones.

# Get a per capita (100K) estimate for each city
# First, generate the population vector:
popList <- numeric()
for (i in 1:nrow(casByCity)) {
      theCity <- casByCity$city_or_county[i]
      theState <- casByCity$state[i]
      theYear <- casByCity$year[i]
      ind <- which(citypop$city == theCity)
      
      if (length(ind) > 0 && theCity == citypop$city[ind] && theState == citypop$state[ind]) {
            popList <- c(popList, as.numeric(citypop[citypop$state ==
                                    theState & citypop$city == theCity, theYear]))
      } else {
            popList <- c(popList, NA)
      }
}

We’re also going to need the two letter abbreviations for each state for labels later, so let’s grab those and put them in the data frame as well. Don’t forget Washington, D.C.!:

# Get the short name for each state and attach it to the tibble
data(state)
abbs <- data.frame("abb" = state.abb, "state" = state.name, stringsAsFactors = FALSE)
abbs <- rbind(abbs, c("DC", "District of Columbia"), stringsAsFactors = FALSE)
abbList <- unlist(lapply(casByCity$state, function(x) abbs[which(abbs$state == x),]$abb))

Add the population and abbrevation columns, and calculate the mean values for each each state:

meanCBC <- ungroup(casByCity) %>% mutate(abb = abbList, population = popList, cas.per.capita100K = n.casualty/population*100000, k.per.capita100K = n.killed/population*100000, i.per.capita100K = n.injured/population*100000)

# Remove NA values and then compute the mean on the remaining values; sort by mean
meanCBC <- group_by(meanCBC, city_or_county, state, abb) %>% filter(!is.na(population)) %>% summarize(meanCasRate = mean(cas.per.capita100K), meanKRate = mean(k.per.capita100K), meanIRate = mean(i.per.capita100K)) %>% arrange(desc(meanCasRate))

meanCBC
## # A tibble: 700 x 6
## # Groups:   city_or_county, state [700]
##    city_or_county state       abb   meanCasRate meanKRate meanIRate
##    <chr>          <chr>       <chr>       <dbl>     <dbl>     <dbl>
##  1 Trenton        New Jersey  NJ           160.      23.7     137. 
##  2 New Orleans    Louisiana   LA           156.      41.6     114. 
##  3 Gary           Indiana     IN           151.      50.3     100. 
##  4 Baltimore      Maryland    MD           133.      40.4      92.6
##  5 Flint          Michigan    MI           132.      39.8      92.0
##  6 Savannah       Georgia     GA           110.      25.3      84.9
##  7 Chicago        Illinois    IL           110.      18.4      91.6
##  8 Birmingham     Alabama     AL           108.      39.1      69.2
##  9 Jackson        Mississippi MS           107.      32.0      75.4
## 10 Dayton         Ohio        OH           105.      23.1      82.3
## # ... with 690 more rows

Let’s plot the top 50:

top50 <- meanCBC[1:50,]

top50 <- arrange(top50, meanCasRate)
top50 <- mutate(top50, meanIRate = -meanIRate)
top50melt2 <- melt(top50[,c("city_or_county", "abb", "meanKRate", "meanIRate")], id.vars = c("city_or_county", "abb"))

ggplot(data = top50melt2) + geom_col(aes(x = city_or_county, y = value, fill = variable)) +
      scale_y_continuous(limits = c(-150,150), breaks = c(150, 100, 50, 0, -50, -100, -150), labels = c("150", "100", "50", "0", "50", "100", "150")) + 
      theme(axis.text.x  = element_text(hjust = .95, size = 12)) + 
      theme(axis.text.y = element_text(vjust = 0.5, size = 10)) +
      theme(axis.title.y = element_text(size = 14)) + 
      theme(plot.title = element_text(size = 16)) + 
      theme(legend.title = element_blank()) + 
      theme(legend.position = "bottom") + 
      theme(legend.text = element_text(size = 14)) + 
      scale_fill_discrete(labels = c("Fatalities", "Injuries"), guide = guide_legend(reverse = TRUE)) +
      labs(x = NULL, y = NULL) +
      labs(title = "Mean Injury/Fatality Rate per 100,000 People by City (2014-2017 where available)") + 
      scale_x_discrete(limits = top50$city_or_county, labels = paste(top50$city_or_county, sep = ", ", top50$abb)) +
      coord_flip()

According to the data, the top 5 cities with the most gun violence are:

  1. Trenton, NJ
  2. New Orleans, LA
  3. Gary, IN
  4. Baltimore, MD
  5. Flint, MI

Chicago comes in at 7th, even though Illinois was the second highest per capita state.

Effect of Gun Legislation on Gun Casualty Rates

One topic that is furiously being debated across the nation is whether or not to pass more/stricter gun laws on top of the ones that already exist. What is the effect of gun laws on casualty rate? Let’s analyze this both at the state level and city level.

By State

First, read the data for the state laws into R. The data were downloaded from the State Firearm Law Database (http://www.statefirearmlaws.org/). I had to reformat the files by re-saving into Excel because they weren’t actually saved as a CSV, so it wouldn’t have been easy to read it into R. You can download the reformatted CSV files from Github.

lawsdf <- read_csv("data/Laws/48_states_1991_data.csv")
lawsdf <- rbind(lawsdf, read_csv("data/Laws/Hawaii_1991_data.csv"))
lawsdf <- rbind(lawsdf, read_csv("data/Laws/Alaska_1991_data.csv"))

# Rename one of the columns to something easy to reference
lawsdf <- rename(lawsdf, law = `Law? 0/1`)
laws.by.year <- group_by(lawsdf, State, Year) %>% summarize(laws = sum(law))

# Grab the years we are interested in, 2014-2017, and take the average number of laws for the period
mean.laws.by.year <- filter(laws.by.year, Year >= 2014) %>% summarize(meanlaw = mean(laws))

For a first pass, all we are looking at is the absolute number of laws on the books in each state. For simplicity, we assume that each law is as effective as the next (although that is probably not quite true, and each law’s effectiveness probably varies across each state/municipality and has a number of confounding variables). Let’s look at the distribution of the number of laws on the books for all 50 states:

# Histogram of number of laws
ggplot(data = mean.laws.by.year, aes(x = meanlaw)) + geom_histogram(binwidth = 10, col = "black") + labs(x = "Mean Number of Laws, 2014-2017", y = "Frequency")

It appears that half the states have around 15 or fewer laws on the books, with the average number of laws being around 25.

Is there a correlation between the number of laws in each state and their mean per capita casualty rate?

meanCBS <- arrange(meanCBS, state)
mean.laws.by.year <- arrange(mean.laws.by.year, State)
meanCBS <- mutate(meanCBS, numlaws = mean.laws.by.year$meanlaw)
qplot(x = numlaws, y = meanCasRate, data = meanCBS, xlab = "Number of Laws", ylab = "Mean Casualty Rate Per 100,000 People")

It doesn’t appear so. Some states have few laws on the books and have a lower casualty rate, whereas others achieve the low casualty rate with a higher than average number of laws. There are also a lot of states with a high casualty rate and few laws. It could be argued that the states with more laws would actually be more violent if the laws didn’t exist, but it is difficult to tease out that effect.

Let’s look at how each state ranks in terms of casualty rate and the number of gun laws:

g <- ggplot(data = meanCBS, aes(x = reorder(state, -meanCasRate), y = meanCasRate, fill = numlaws))
p2 <- g + geom_col(color = "gray") + 
      theme(axis.text.x  = element_text(angle=90, vjust=0.5, size = 14)) + 
      theme(axis.title.y = element_text(size = 16)) + 
      theme(plot.title = element_text(size = 16)) + 
      labs(x = NULL, y = "Mean Casualty Rate per 100,000 People", 
           title = "Mean Casualty Rate per 100,000 People by State (2014-2017)")

pcaswithlaw <- p2 + scale_fill_gradient2(low = muted("red"), mid = "white", high = muted("blue"), midpoint = max(meanCBS$numlaws)/2, name = "Avg. Number of Gun Laws", guide = guide_colorbar(direction = "horizontal", title.position = "top", barwidth = 10, title.hjust = .5)) + 
      theme(legend.background = element_rect(fill="gray90", size=.3, color = "black", linetype = 1)) +
      theme(legend.justification=c(1,1), legend.position=c(1,1)) +
      geom_hline(yintercept = median(meanCBS$meanCasRate), lwd = 1)
pcaswithlaw

The top 7 states with the highest average number of laws from 2014-2017 are California (103), Massachusetts (100), Connecticut (86.5), New York (75), New Jersey (68.8), and Illinois (64). Of these seven, two states are above the median casualty rate for the US (Illinois and Maryland) - the other five rank a little below the median rate. It appears there may be a weak association between number of laws and state casualty rate, but there are probably other factors at play as well, such as population density. There are a number of states below and above the median casualty rate with fewer gun laws, so it is difficult to conclude that more gun laws translate to less gun violence.

By City

Let’s look at the data by city and try to figure out if there is a correlation between the number of laws in a particular state and the per capita casualty rates for cities in that state:

# First, remove DC since we don't have data on the number of gun laws there
meanCBClaws <- filter(meanCBC, state != "District of Columbia")
lawlist <- sapply(meanCBClaws$state, function(x) mean.laws.by.year[mean.laws.by.year$State == x,2])

meanCBClaws <- ungroup(meanCBClaws) %>% mutate("numlaws" = round(unlist(lawlist), digits = 1), meanCasRate = round(meanCasRate, digits = 1), meanKRate = round(meanKRate, digits = 1), meanIRate = round(meanIRate, digits = 1))

oldnames <- names(meanCBClaws) # Save the current column names for later

# Do some renaming to make things easier to understand
names(meanCBClaws) <- c("City", "state.long", "State", "Casualty Rate", "Fatality Rate", "Injury Rate", "Number of Laws")

p.allcities <- ggplot(data = meanCBClaws, aes(x = `Number of Laws`, y = `Casualty Rate`, color = State)) +
      geom_point(aes(text = paste(City, ", ", State, "<br>Number of Laws: ", `Number of Laws`, "<br>Casualty Rate: ", `Casualty Rate`, "<br>Fatality Rate: ", `Fatality Rate`, "<br>Injury Rate: ", `Injury Rate`, sep = ""))) + 
      labs(x = "Number of Laws") + 
      labs (y = "Casualty Rate (Per 100,000 People)") +
      theme(legend.position = "none")

ggplotly(p.allcities, tooltip = c("text"))

There are 699 cities displayed here (there are actually a lot more cities throughout the US where gun violence occurred, but remember, the population data was only available for cities over 50,000 people). It is clear that in a given state, there is a wide distribution in the casualty rates. You can hover over each point to see the data it represents.

Let’s look at a box and whisker plot in order to see the distribution in casualty rates for each city in a particular state.

ggplot(data = meanCBClaws, aes(x = reorder(state.long, -`Casualty Rate`), y = `Casualty Rate`, fill = `Number of Laws`)) +
      geom_boxplot() + 
      labs(x = NULL) + 
      labs (y = "Casualty Rate (Per 100,000 People)") +
      scale_fill_gradient2(midpoint = max(meanCBClaws$`Number of Laws`)/2, name = "Avg. Number of State Gun Laws", guide = guide_colorbar(direction = "horizontal", title.position = "top", barwidth = 10, title.hjust = .5)) + 
      theme(legend.background = element_rect(fill="gray90", size=.3, color = "black", linetype = 1)) +
      theme(legend.justification=c(1,1), legend.position=c(1,1)) +
      theme(axis.text.x  = element_text(angle=90, vjust=0.5, size = 14)) + 
      theme(axis.title.y = element_text(size = 16)) + 
      theme(plot.title = element_text(size = 16))